Set the working dir
setwd("/mnt/picea/projects/arabidopsis/mschmid/porcupine-RNA-Seq")
Libs
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(Glimma))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(Mfuzz))
suppressPackageStartupMessages(library(org.At.tair.db))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(VennDiagram))
Helper files
suppressMessages(source("~/Git/UPSCb/src/R/densityPlot.R"))
suppressMessages(source("~/Git/UPSCb/src/R/plotMA.R"))
suppressMessages(source("~/Git/UPSCb/src/R/volcanoPlot.R"))
suppressMessages(source("~/Git/UPSCb/src/R/plot.multidensity.R"))
Load saved data Read the sample information
samples <- read.csv("~/Git/UPSCb/projects/arabidopsis-porcupine-RNA-Seq/doc/RNA-seq_pcp_vs_Col-0_info.csv")
And extend it
samples$Genotype=sub(" .*","",samples$Description)
samples$Temp=ifelse(sub(".* ","",samples$Description)=="16",
ifelse(nchar(as.character(samples$Description))>16,"WC","C"),
ifelse(nchar(as.character(samples$Description))>16,"CW","W"))
Gene raw expression
cg <- read.csv("analysis/kallisto/combined-raw-unormalised-gene-expression_data.csv",
row.names=1)
Transcript raw expression
ct <- read.csv("analysis/kallisto/combined-raw-unormalised-transcript-expression_data.csv",
row.names=1)
Gene normalised expression
vst.g <- read.csv("analysis/kallisto/combined-library-size-normalized_variance-stabilized_gene-expression_data.csv",
row.names=1)
Transcript raw expression
vst.t <- read.csv("analysis/kallisto/combined-library-size-normalized_variance-stabilized_transcript-expression_data.csv",
row.names=1)
Setup graphics
pal=brewer.pal(8,"Dark2")
mar <- par("mar")
This is pcp
pcp <- "AT2G18740"
Re-order the samples
stopifnot(all(colnames(cg) == colnames(vst.g)))
stopifnot(all(colnames(ct) == colnames(vst.t)))
stopifnot(all(colnames(ct) == colnames(cg)))
colnames(ct) <- colnames(cg) <- colnames(vst.t) <- colnames(vst.g) <- sub("X","",colnames(cg))
samples <- samples[match(colnames(ct),samples$SampleID),]
df.annot <- data.frame(GeneID=rownames(cg),
SYMBOL=mapIds(org.At.tair.db,keys=rownames(cg),"SYMBOL","TAIR"),
GENENAME=mapIds(org.At.tair.db,keys=rownames(cg),"GENENAME","TAIR"))
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
tx.annot <- df.annot[match(sub("\\.[0-9]+","",rownames(vst.t)),df.annot$GeneID),]
row.names(tx.annot) <- rownames(vst.t)
tx.annot$TranscriptID <- rownames(vst.t)
ord <- order(samples$Genotype,samples$Temp)
plotmatrix(as.matrix(vst.g[pcp,ord]),main="pcp gene expression",
ylab="vst expression",xlabels = paste(samples$Genotype,
samples$Temp,sep="-")[ord],
las=2,col=pal)
Same as a dotchart
dat <- split(unlist(vst.g[pcp,]),paste(samples$Genotype,
samples$Temp,sep="-"))
In the dotcharts, the coloured dots represent the mean expression of the given condition
dotchart(do.call("cbind",dat),
gdata=sapply(dat,mean),pch = 1,
labels = "",gpch = 19,gcolor = pal,
xlab="vst expression",
main="pcp gene expression")
plotmatrix(as.matrix(vst.t[grep(pcp,rownames(vst.t)),ord]),main="pcp gene expression",
ylab="vst expression",xlabels = paste(samples$Genotype,
samples$Temp,sep="-")[ord],
las=2,col=pal)
In the dotcharts, the coloured dots represent the mean expression of the given condition
dev.null <- sapply(grep(pcp,rownames(vst.t)),function(pos){
dat <- split(unlist(vst.t[pos,]),
paste(samples$Genotype,
samples$Temp,sep="-"))
dotchart(do.call("cbind",dat),
gdata=sapply(dat,mean),pch = 1,
labels = "",gpch = 19,gcolor = pal,
xlab="vst expression",
main=paste("pcp",rownames(vst.t)[pos],"expression"))
})
dds.g <- DESeqDataSetFromMatrix(
countData = cg,
colData = data.frame(genotype=samples$Genotype,
age=samples$Age.in.days,
temp=samples$Temp,
finalTemp=factor(substr(samples$Temp,
nchar(samples$Temp),
nchar(samples$Temp)))),
design = ~genotype * finalTemp)
Differential Expression
suppressMessages(dds.g <- DESeq(dds.g))
Dispersion Estimation
The dispersion estimation is adequate
plotDispEsts(dds.g)
res <- results(dds.g,name="genotype_pcp_vs_Col.0")
assume a 1% FDR
alpha=0.01
Plot the Median vs Average
There are many genes that appear differentially expressed at a 1% FDR cutoff The log2 fold-change range is relatively broad, with three extreme values
plotMA(res,alpha)
Plot the log odds vs. log2 fold change
The volcano plot shows the same results as the MA plot; a large number of genes show significant fold-changes
volcanoPlot(res,alpha=alpha)
Plot the adjusted p-value histogram
Which is almost evenly distributed, with an enrichment for lower p-values (more significant)
hist(res$padj,breaks=seq(0,1,.01))
Select genes below alpha
Note the 0.5 cutoff on the log fold change is motivated by the Schurch et al. RNA, 2016 publication
sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= 0.5
sprintf("There are %s genes differentially expressed at a %s cutoff",
sum(sel),alpha)
## [1] "There are 1440 genes differentially expressed at a 0.01 cutoff"
Equally distributed between Wt (-1) and pcp (1)
pander(table(sign(res[sel,"log2FoldChange"])))
| -1 | 1 |
|---|---|
| 734 | 706 |
Write them out
dir.create("analysis/DESeq2/Kallisto-gene/genotype-effect",
showWarnings=FALSE,recursive=TRUE)
write.csv(df.annot[sel,],
file=file.path("analysis/DESeq2/Kallisto-gene/genotype-effect",
"pcp-vs-wt_one-percent-FDR-cutoff_significant-genes.csv"))
write.csv(df.annot[sel,][sign(res$log2FoldChange[sel])==1,],
file=file.path("analysis/DESeq2/Kallisto-gene/genotype-effect",
"pcp-up-regulated_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(df.annot[sign(res$log2FoldChange[sel])==-1,],
file=file.path("analysis/DESeq2/Kallisto-gene/genotype-effect",
"wt-up-regulated_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(cbind(as.data.frame(res),df.annot),
file=file.path("analysis/DESeq2/Kallisto-gene/genotype-effect",
"pcp_vs_wt.csv"))
pcp.vs.wt <- rownames(res)[sel]
The color code for the column next to the row dendogram (genes) is yellow for significant positive fold change and darkorange for significant negative fold change
heatmap.2(as.matrix(vst.g[rownames(vst.g) %in% pcp.vs.wt,]),
scale="row",labRow=NA,trace="none",
RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
"darkorange",
"yellow"),
labCol=paste(colData(dds.g)$genotype,colData(dds.g)$temp,sep="-"))
res.df <- as.data.frame(res)
res.df$log10MeanNormCount <- log10(res.df$baseMean)
idx <- rowSums(counts(dds.g)) > 0
res.df <- res.df[idx,]
res.df$padj[is.na(res.df$padj)] <- 1
glMDPlot(res.df,
xval = "log10MeanNormCount",
yval = "log2FoldChange",
counts = counts(dds.g)[idx,],
anno = df.annot[idx,],
groups = dds.g$genotype,
samples = paste(samples$Genotype,samples$Temp,samples$Replicate,sep="-"),
status = sel[idx],
display.columns = c("GeneID", "SYMBOL","GENENAME"),
path = "analysis/DESeq2/Kallisto-gene/genotype-effect/",
folder = "report",launch=FALSE)
res <- results(dds.g,name="finalTemp_W_vs_C")
assume a 1% FDR
alpha=0.01
There are many genes that appear differentially expressed. The log2 fold-change range is relatively broad, with two extreme positive values
plotMA(res,alpha)
Plot the log odds vs. log2 fold change
The volcano plot shows the same results as the MA plot; a large number of genes show significant fold-changes
volcanoPlot(res,alpha=alpha)
Plot the adjusted p-value histogram
Which is almost evenly distributed, with an enrichment for lower p-values (most significant)
hist(res$padj,breaks=seq(0,1,.01),xlab="FDR")
Select genes below alpha
Note the 0.5 cutoff on the log fold change is motivated by the Schurch et al. RNA, 2016 publication
sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= 0.5
sprintf("There are %s genes differentially expressed at a %s cutoff",
sum(sel),alpha)
## [1] "There are 1903 genes differentially expressed at a 0.01 cutoff"
Slightly skewed towards over-expression in cold (-1) than in warm (1)
pander(table(sign(res[sel,"log2FoldChange"])))
| -1 | 1 |
|---|---|
| 1066 | 837 |
Write them out
dir.create("analysis/DESeq2/Kallisto-gene/temperature-effect",showWarnings=FALSE)
write.csv(df.annot[sel,],
file=file.path("analysis/DESeq2/Kallisto-gene/temperature-effect",
"Warm-vs-Cold-final-temperature_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(df.annot[sel,][sign(res$log2FoldChange[sel])==1,],
file=file.path("analysis/DESeq2/Kallisto-gene/temperature-effect",
"Warm-final-temperature-up-regulated_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(df.annot[sel,][sign(res$log2FoldChange[sel])==-1,],
file=file.path("analysis/DESeq2/Kallisto-gene/temperature-effect",
"Cold-final-temperature-up-regulated_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(cbind(as.data.frame(res),df.annot),
file=file.path("analysis/DESeq2/Kallisto-gene/temperature-effect",
"Warm-vs-Cold-final-temperature.csv"))
warm.vs.cold <- rownames(res)[sel]
The color code for the column next to the row dendogram (genes) is yellow for significant positive fold change and dark orange for significant negative fold change
heatmap.2(as.matrix(vst.g[rownames(vst.g) %in% warm.vs.cold,]),
scale="row",labRow=NA,trace="none",
RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
"darkorange",
"yellow"),
labCol=paste(colData(dds.g)$genotype,colData(dds.g)$temp,sep="-"))
res.df <- as.data.frame(res)
res.df$log10MeanNormCount <- log10(res.df$baseMean)
idx <- rowSums(counts(dds.g)) > 0
res.df <- res.df[idx,]
res.df$padj[is.na(res.df$padj)] <- 1
glMDPlot(res.df,
xval = "log10MeanNormCount",
yval = "log2FoldChange",
counts = counts(dds.g)[idx,],
anno = df.annot[idx,],
groups = dds.g$genotype,
samples = paste(samples$Genotype,samples$Temp,samples$Replicate,sep="-"),
status = sel[idx],
display.columns = c("GeneID", "SYMBOL","GENENAME"),
path = "analysis/DESeq2/Kallisto-gene/temperature-effect",
folder = "report",launch=FALSE)
res <- results(dds.g,name="genotypepcp.finalTempW")
assume a 1% FDR
alpha=0.01
There are not many genes that appear differentially expressed. The log2 fold-change range is relatively broad, with one extreme value.
plotMA(res,alpha)
Plot the log odds vs. log2 fold change
The volcano plot shows the same results as the MA plot; a large number of genes show large insignificant fold-changes in term of the interaction of pcp and final temperature
volcanoPlot(res,alpha=alpha)
Plot the adjusted p-value histogram
Which is almost evenly distributed, with an enrichment for higher p-values (less significant)
hist(res$padj,breaks=seq(0,1,.01),xlab="FDR")
Select genes below alpha
Note the 0.5 cutoff on the log fold change is motivated by the Schurch et al. RNA, 2016 publication
sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= 0.5
sprintf("There are %s genes differentially expressed at a %s cutoff",
sum(sel),alpha)
## [1] "There are 319 genes differentially expressed at a 0.01 cutoff"
Almost evenly distributed
pander(table(sign(res[sel,"log2FoldChange"])))
| -1 | 1 |
|---|---|
| 161 | 158 |
Write them out
dir.create("analysis/DESeq2/Kallisto-gene/interaction-term",
showWarnings = FALSE)
write.csv(df.annot[sel,],
file=file.path("analysis/DESeq2/Kallisto-gene/interaction-term",
"pcp-final-temperature-interaction_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(df.annot[sel,][sign(res$log2FoldChange[sel])==1,],
file=file.path("analysis/DESeq2/Kallisto-gene/interaction-term",
"pcp-final-temperature-interaction-up-regulated_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(df.annot[sel,][sign(res$log2FoldChange[sel])==-1,],
file=file.path("analysis/DESeq2/Kallisto-gene/interaction-term",
"pcp-final-temperature-interaction-down-regulated_one-percent-FDR-cutoff_significant-genes.txt"))
write.csv(cbind(as.data.frame(res),df.annot),
file=file.path("analysis/DESeq2/Kallisto-gene/interaction-term",
"pcp-final-temperature-interaction.csv"))
pcp.temperature <- rownames(res)[sel]
The color code for the column next to the row dendogram (genes) is yellow for significant positive fold change and dark orange for significant negative fold change
heatmap.2(as.matrix(vst.g[rownames(vst.g) %in% pcp.temperature,]),
scale="row",labRow=NA,trace="none",
RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
"darkorange",
"yellow"),
labCol=paste(colData(dds.g)$genotype,colData(dds.g)$temp,sep="-"))
res.df <- as.data.frame(res)
res.df$log10MeanNormCount <- log10(res.df$baseMean)
idx <- rowSums(counts(dds.g)) > 0
res.df <- res.df[idx,]
res.df$padj[is.na(res.df$padj)] <- 1
glMDPlot(res.df,
xval = "log10MeanNormCount",
yval = "log2FoldChange",
counts = counts(dds.g)[idx,],
anno = df.annot[idx,],
groups = dds.g$genotype,
samples = paste(samples$Genotype,samples$Temp,samples$Replicate,sep="-"),
status = sel[idx],
display.columns = c("GeneID", "SYMBOL","GENENAME"),
path = "analysis/DESeq2/Kallisto-gene/interaction-term",
folder = "report",launch=FALSE)
Create the eSet
eset <- ExpressionSet(as.matrix(vst.g[sel,]))
Standardise
eset.s <- standardise(eset)
Average the replicates (mean)
eset.m <- ExpressionSet(
sapply(split.data.frame(t(exprs(eset.s)),
paste0(samples$Genotype,samples$Temp)),
colMeans))
Estimate the fuzzification
m <- mestimate(eset.m)
Find the clusters (12 is based on the previous heatmap)
cl <- mfuzz(eset.m,centers=12,m=m)
Order the replicates
ord <- c(4,1:3,8,5:7)
There are a number of clusters that behave similarly
mfuzz.plot(eset.m[,ord],
cl=cl,
mfrow=c(1,1),
time.labels = colnames(eset.m)[ord],
new.window=FALSE)
write.csv(cbind(as.data.frame(res),df.annot),
file=file.path("analysis/DESeq2/Kallisto-gene/interaction-term",
"pcp-final-temperature-interaction.csv"))
Writing the core genes from the clusters
dir.create("analysis/DESeq2/Kallisto-gene/interaction-term/clusters",
showWarnings = FALSE)
Not all genes will be part of the core genes from the clusters
dev.null <- sapply(1:12,function(i,d){
nam <- d[[i]]
write.csv(cbind(ID=nam,df.annot[nam,]),
file=file.path("analysis/DESeq2/Kallisto-gene/interaction-term/clusters",
paste("cluster",i,sep="-")))
},lapply(sapply(acore(eset.m,cl,0.5),"[[","NAME"),as.character))
plot.new()
grid.draw(venn.diagram(list(
pcp.vs.wt=pcp.vs.wt,
warm.vs.cold=warm.vs.cold,
pcp.temperature=pcp.temperature),
filename=NULL,
col=pal[1:3],
category.names=c("genotype","temperature","interaction")))
compareKallistoHTSeq <- function(kfile,hfile){
k <- read.csv(kfile,row.names=1)
h <- read.csv(hfile,row.names=1)
k <- k[match(sub("\\.0$","",rownames(h)),rownames(k)),]
heatscatter(k[,"log2FoldChange"],h[,"log2FoldChange"],main="Log2 Fold-changes comparison",
xlab="Kallisto",ylab="HTSeq",cor=TRUE)
abline(0,1,lty=2,col="gray")
qqplot(k[,"log2FoldChange"],h[,"log2FoldChange"],main="QQplot",xlab="Kallisto",ylab="HTSeq")
abline(0,1,lty=2,col="gray")
list(spearman=suppressWarnings(cor.test(k[,"log2FoldChange"],h[,"log2FoldChange"],method="spearman")),
wilcoxon=wilcox.test(k[,"log2FoldChange"],h[,"log2FoldChange"]),
friedman=friedman.test(cbind(k[,"log2FoldChange"],h[,"log2FoldChange"])),
quade=quade.test(cbind(k[,"log2FoldChange"],h[,"log2FoldChange"])))
}
compareKallistoHTSeq("analysis/DESeq2/Kallisto-gene/genotype-effect/pcp_vs_wt.csv",
"analysis/DESeq2/pcp_vs_wt.csv")
## $spearman
##
## Spearman's rank correlation rho
##
## data: k[, "log2FoldChange"] and h[, "log2FoldChange"]
## S = 5.5816e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9760598
##
##
## $wilcoxon
##
## Wilcoxon rank sum test with continuity correction
##
## data: k[, "log2FoldChange"] and h[, "log2FoldChange"]
## W = 296760000, p-value = 0.4028
## alternative hypothesis: true location shift is not equal to 0
##
##
## $friedman
##
## Friedman rank sum test
##
## data: cbind(k[, "log2FoldChange"], h[, "log2FoldChange"])
## Friedman chi-squared = 6143.8, df = 1, p-value < 2.2e-16
##
##
## $quade
##
## Quade test
##
## data: cbind(k[, "log2FoldChange"], h[, "log2FoldChange"])
## Quade F = 4435.9, num df = 1, denom df = 27415, p-value < 2.2e-16
h.pcp.vs.wt <- sub("\\.0","",scan(what="character",
file="analysis/DESeq2/pcp-vs-wt_one-percent-FDR-cutoff_significant-genes.txt"))
plot.new()
grid.draw(venn.diagram(list(
kallisto=pcp.vs.wt,
htseq=h.pcp.vs.wt),
filename=NULL,
col=pal[1:2],
category.names=c("kallisto (pcp)","htseq (pcp)")))
compareKallistoHTSeq("analysis/DESeq2/Kallisto-gene/Warm-vs-Cold-final-temperature.csv","analysis/DESeq2/Warm-vs-Cold-final-temperature.csv")
## $spearman
##
## Spearman's rank correlation rho
##
## data: k[, "log2FoldChange"] and h[, "log2FoldChange"]
## S = 5.6955e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9755713
##
##
## $wilcoxon
##
## Wilcoxon rank sum test with continuity correction
##
## data: k[, "log2FoldChange"] and h[, "log2FoldChange"]
## W = 298540000, p-value = 0.758
## alternative hypothesis: true location shift is not equal to 0
##
##
## $friedman
##
## Friedman rank sum test
##
## data: cbind(k[, "log2FoldChange"], h[, "log2FoldChange"])
## Friedman chi-squared = 337.34, df = 1, p-value < 2.2e-16
##
##
## $quade
##
## Quade test
##
## data: cbind(k[, "log2FoldChange"], h[, "log2FoldChange"])
## Quade F = 17.175, num df = 1, denom df = 27415, p-value =
## 3.419e-05
h.warm.vs.cold <- sub("\\.0","",scan(what="character",file="analysis/DESeq2/Warm-vs-Cold-final-temperature_one-percent-FDR-cutoff_significant-genes.txt"))
plot.new()
grid.draw(venn.diagram(list(
kallisto=warm.vs.cold,
htseq=h.warm.vs.cold),
filename=NULL,
col=pal[1:2],
category.names=c("kallisto (temp)","htseq (temp)")))
compareKallistoHTSeq("analysis/DESeq2/Kallisto-gene/pcp-final-temperature-interaction.csv","analysis/DESeq2/pcp-final-temperature-interaction.csv")
## $spearman
##
## Spearman's rank correlation rho
##
## data: k[, "log2FoldChange"] and h[, "log2FoldChange"]
## S = 7.8345e+10, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9663966
##
##
## $wilcoxon
##
## Wilcoxon rank sum test with continuity correction
##
## data: k[, "log2FoldChange"] and h[, "log2FoldChange"]
## W = 299430000, p-value = 0.3791
## alternative hypothesis: true location shift is not equal to 0
##
##
## $friedman
##
## Friedman rank sum test
##
## data: cbind(k[, "log2FoldChange"], h[, "log2FoldChange"])
## Friedman chi-squared = 1588.7, df = 1, p-value < 2.2e-16
##
##
## $quade
##
## Quade test
##
## data: cbind(k[, "log2FoldChange"], h[, "log2FoldChange"])
## Quade F = 751.04, num df = 1, denom df = 27415, p-value < 2.2e-16
h.pcp.temperature <- sub("\\.0","",scan(what="character",file="analysis/DESeq2/pcp-final-temperature-interaction_one-percent-FDR-cutoff_significant-genes.txt"))
plot.new()
grid.draw(venn.diagram(list(
kallisto=pcp.temperature,
htseq=h.pcp.temperature),
filename=NULL,
col=pal[1:2],
category.names=c("kallisto (temp)","htseq (temp)")))
The results of the gene DE analysis are very similar to those obtained from STAR + HTSeq. The difference in using Kallisto probably arises from it rescuing multi-mapping reads.
dds.t <- DESeqDataSetFromMatrix(
countData = ct,
colData = data.frame(genotype=samples$Genotype,
age=samples$Age.in.days,
temp=samples$Temp,
finalTemp=factor(substr(samples$Temp,
nchar(samples$Temp),
nchar(samples$Temp)))),
design = ~genotype * finalTemp)
Differential Expression
suppressMessages(dds.t <- DESeq(dds.t))
Dispersion Estimation
The dispersion estimation is adequate
plotDispEsts(dds.t)
res <- results(dds.t,name="genotype_pcp_vs_Col.0")
assume a 1% FDR
alpha=0.01
Plot the Median vs Average There are many genes that appear differentially expressed at a 1% FDR cutoff The log2 fold-change range is relatively broad, with three extreme values
plotMA(res,alpha)
Plot the log odds vs. log2 fold change
The volcano plot shows the same results as the MA plot; a large number of genes show significant fold-changes
volcanoPlot(res,alpha=alpha)
Plot the adjusted p-value histogram
Which is almost evenly distributed, with an enrichment for lower p-values (more significant)
hist(res$padj,breaks=seq(0,1,.01))
Select genes below alpha
Note the 0.5 cutoff on the log fold change is motivated by the Schurch et al. RNA, 2016 publication
sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= 0.5
sprintf("There are %s transcripts differentially expressed at a %s cutoff",
sum(sel),alpha)
## [1] "There are 1628 transcripts differentially expressed at a 0.01 cutoff"
Almost equally distributed between Wt (-1) and pcp (1)
pander(table(sign(res[sel,"log2FoldChange"])))
| -1 | 1 |
|---|---|
| 857 | 771 |
Write them out
dir.create("analysis/DESeq2/Kallisto-transcript/genotype-effect",
showWarnings=FALSE,recursive=TRUE)
write.csv(cbind(txID=rownames(res)[sel],tx.annot[sel,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/genotype-effect",
"pcp-vs-wt_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(txID=rownames(res)[sel & sign(res$log2FoldChange)==1],
tx.annot[sel & sign(res$log2FoldChange)==1,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/genotype-effect",
"pcp-up-regulated_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(txID=rownames(res)[sel & sign(res$log2FoldChange) == -1],
tx.annot[sel & sign(res$log2FoldChange) == -1,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/genotype-effect",
"wt-up-regulated_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(as.data.frame(res),tx.annot),
file=file.path("analysis/DESeq2/Kallisto-transcript/genotype-effect",
"pcp_vs_wt.csv"))
t.pcp.vs.wt <- rownames(res)[sel]
The color code for the column next to the row dendogram (genes) is yellow for significant positive fold change and darkorange for significant negative fold change
heatmap.2(as.matrix(vst.t[rownames(vst.t) %in% t.pcp.vs.wt,]),
scale="row",labRow=NA,trace="none",
RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
"darkorange",
"yellow"),
labCol=paste(colData(dds.t)$genotype,colData(dds.t)$temp,sep="-"))
res.df <- as.data.frame(res)
res.df$log10MeanNormCount <- log10(res.df$baseMean)
idx <- rowSums(counts(dds.t)) > 0
res.df <- res.df[idx,]
res.df$padj[is.na(res.df$padj)] <- 1
glMDPlot(res.df,
xval = "log10MeanNormCount",
yval = "log2FoldChange",
counts = counts(dds.t)[idx,],
anno = tx.annot[idx,],
groups = dds.t$genotype,
samples = paste(samples$Genotype,samples$Temp,samples$Replicate,sep="-"),
status = sel[idx],
display.columns = c("TranscriptID", "SYMBOL","GENENAME"),
id.column="TranscriptID",
path = "analysis/DESeq2/Kallisto-transcript/genotype-effect",
folder = "report",launch=FALSE)
res <- results(dds.t,name="finalTemp_W_vs_C")
assume a 1% FDR
alpha=0.01
There are many genes that appear differentially expressed. The log2 fold-change range is relatively broad, with two extreme positive values
plotMA(res,alpha)
Plot the log odds vs. log2 fold change
The volcano plot shows the same results as the MA plot; a large number of genes show significant fold-changes
volcanoPlot(res,alpha=alpha)
Plot the adjusted p-value histogram
Which is almost evenly distributed, with an enrichment for lower p-values (most significant)
hist(res$padj,breaks=seq(0,1,.01),xlab="FDR")
Select genes below alpha
Note the 0.5 cutoff on the log fold change is motivated by the Schurch et al. RNA, 2016 publication
sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= 0.5
sprintf("There are %s transcripts differentially expressed at a %s cutoff",
sum(sel),alpha)
## [1] "There are 1960 transcripts differentially expressed at a 0.01 cutoff"
Slightly skewed towards over-expression in cold (-1) than in warm (1)
pander(table(sign(res[sel,"log2FoldChange"])))
| -1 | 1 |
|---|---|
| 1099 | 861 |
Write them out
dir.create("analysis/DESeq2/Kallisto-transcript/temperature-effect",
showWarnings = FALSE, recursive=TRUE)
write.csv(cbind(txID=rownames(res)[sel],tx.annot[sel,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/temperature-effect",
"Warm-vs-Cold-final-temperature_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(txID=rownames(res)[sel & sign(res$log2FoldChange)==1],
tx.annot[sel & sign(res$log2FoldChange)==1,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/temperature-effect",
"Warm-final-temperature-up-regulated_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(txID=rownames(res)[sel & sign(res$log2FoldChange)==-1],
tx.annot[sel & sign(res$log2FoldChange) == -1,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/temperature-effect",
"Cold-final-temperature-up-regulated_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(as.data.frame(res),tx.annot),
file=file.path("analysis/DESeq2/Kallisto-transcript/temperature-effect",
"Warm-vs-Cold-final-temperature.csv"))
t.warm.vs.cold <- rownames(res)[sel]
The color code for the column next to the row dendogram (genes) is yellow for significant positive fold change and dark orange for significant negative fold change
heatmap.2(as.matrix(vst.t[rownames(vst.t) %in% t.warm.vs.cold,]),
scale="row",labRow=NA,trace="none",
RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
"darkorange",
"yellow"),
labCol=paste(colData(dds.t)$genotype,colData(dds.t)$temp,sep="-"))
res.df <- as.data.frame(res)
res.df$log10MeanNormCount <- log10(res.df$baseMean)
idx <- rowSums(counts(dds.t)) > 0
res.df <- res.df[idx,]
res.df$padj[is.na(res.df$padj)] <- 1
glMDPlot(res.df,
xval = "log10MeanNormCount",
yval = "log2FoldChange",
counts = counts(dds.t)[idx,],
anno = tx.annot[idx,],
groups = dds.t$genotype,
samples = paste(samples$Genotype,samples$Temp,samples$Replicate,sep="-"),
status = sel[idx],
display.columns = c("TranscriptID", "SYMBOL","GENENAME"),
id.column="TranscriptID",
path = "analysis/DESeq2/Kallisto-transcript/temperature-effect",
folder = "report",launch=FALSE)
res <- results(dds.t,name="genotypepcp.finalTempW")
assume a 1% FDR
alpha=0.01
There are not many genes that appear differentially expressed. The log2 fold-change range is relatively broad, with one extreme value.
plotMA(res,alpha)
Plot the log odds vs. log2 fold change
The volcano plot shows the same results as the MA plot; a large number of genes show large insignificant fold-changes in term of the interaction of pcp and final temperature
volcanoPlot(res,alpha=alpha)
Plot the adjusted p-value histogram
Which is almost evenly distributed, with an enrichment for higher p-values (less significant)
hist(res$padj,breaks=seq(0,1,.01),xlab="FDR")
Select genes below alpha
Note the 0.5 cutoff on the log fold change is motivated by the Schurch et al. RNA, 2016 publication
sel <- res$padj<alpha & !is.na(res$padj) & abs(res$log2FoldChange) >= 0.5
sprintf("There are %s transcripts differentially expressed at a %s cutoff",
sum(sel),alpha)
## [1] "There are 285 transcripts differentially expressed at a 0.01 cutoff"
Evenly distributed
pander(table(sign(res[sel,"log2FoldChange"])))
| -1 | 1 |
|---|---|
| 143 | 142 |
Write them out
dir.create("analysis/DESeq2/Kallisto-transcript/interaction-term",
showWarnings = FALSE, recursive = TRUE)
write.csv(cbind(txID=rownames(res)[sel],tx.annot[sel,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/interaction-term",
"pcp-final-temperature-interaction_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(txID=rownames(res)[sel & sign(res$log2FoldChange)==1],
tx.annot[sel & sign(res$log2FoldChange)==1,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/interaction-term",
"pcp-final-temperature-interaction-up-regulated_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(txID=rownames(res)[sel & sign(res$log2FoldChange)==-1],
tx.annot[sel & sign(res$log2FoldChange) == -1,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/interaction-term",
"pcp-final-temperature-interaction-down-regulated_one-percent-FDR-cutoff_significant-transcripts.txt"))
write.csv(cbind(as.data.frame(res),tx.annot),
file=file.path("analysis/DESeq2/Kallisto-transcript/interaction-term",
"pcp-final-temperature-interaction.csv"))
t.pcp.temperature <- rownames(res)[sel]
The color code for the column next to the row dendogram (genes) is yellow for significant positive fold change and dark orange for significant negative fold change
heatmap.2(as.matrix(vst.t[rownames(vst.t) %in% t.pcp.temperature,]),
scale="row",labRow=NA,trace="none",
RowSideColors=ifelse(sign(res[sel,"log2FoldChange"])==-1,
"darkorange",
"yellow"),
labCol=paste(colData(dds.t)$genotype,colData(dds.t)$temp,sep="-"))
res.df <- as.data.frame(res)
res.df$log10MeanNormCount <- log10(res.df$baseMean)
idx <- rowSums(counts(dds.t)) > 0
res.df <- res.df[idx,]
res.df$padj[is.na(res.df$padj)] <- 1
glMDPlot(res.df,
xval = "log10MeanNormCount",
yval = "log2FoldChange",
counts = counts(dds.t)[idx,],
anno = tx.annot[idx,],
groups = dds.t$genotype,
samples = paste(samples$Genotype,samples$Temp,samples$Replicate,sep="-"),
status = sel[idx],
display.columns = c("TranscriptID", "SYMBOL","GENENAME"),
id.column="TranscriptID",
path = "analysis/DESeq2/Kallisto-transcript/interaction-term",
folder = "report",launch=FALSE)
Create the eSet
eset <- ExpressionSet(as.matrix(vst.t[sel,]))
Standardise
eset.s <- standardise(eset)
Average the replicates (mean)
eset.m <- ExpressionSet(
sapply(split.data.frame(t(exprs(eset.s)),
paste0(samples$Genotype,samples$Temp)),
colMeans))
Estimate the fuzzification
m <- mestimate(eset.m)
Find the clusters (12 is based on the previous heatmap)
cl <- mfuzz(eset.m,centers=12,m=m)
Order the replicates
ord <- c(4,1:3,8,5:7)
There are a number of clusters that behave similarly
mfuzz.plot(eset.m[,ord],
cl=cl,
mfrow=c(1,1),
time.labels = colnames(eset.m)[ord],
new.window=FALSE)
Writing the core transcripts from the clusters
dir.create("analysis/DESeq2/Kallisto-transcript/interaction-term/clusters",
showWarnings = FALSE)
Not all genes will be part of the core genes from the clusters
dev.null <- sapply(1:12,function(i,d){
nam <- d[[i]]
write.csv(cbind(ID=nam,tx.annot[nam,]),
file=file.path("analysis/DESeq2/Kallisto-transcript/interaction-term/clusters",
paste("cluster",i,sep="-")))
},lapply(sapply(acore(eset.m,cl,0.5),"[[","NAME"),as.character))
plot.new()
grid.draw(venn.diagram(list(
pcp.vs.wt=t.pcp.vs.wt,
warm.vs.cold=t.warm.vs.cold,
pcp.temperature=t.pcp.temperature),
filename=NULL,
col=pal[1:3],
category.names=c("pcp.vs.wt","warm.vs.cold","pcp.by.temperature")))
The results are highly similar to those obtained at the gene level
plot.multidensity(lapply(1:ncol(vst.g),function(i){vst.g[,i]}),
col=pal[as.integer(colData(dds.g)$genotype)],
xlab="gene expression"
)
abline(v=1,col="grey",lty=3)
plot.multidensity(lapply(1:ncol(vst.t),function(i){vst.t[,i]}),
col=pal[as.integer(colData(dds.t)$genotype)],
xlab="transcript expression"
)
abline(v=1,col="grey",lty=3)
How many genes have alternative transcripts reported?
tx2gene <- read.table("/mnt/picea/storage/reference/Arabidopsis-thaliana/TAIR10/kallisto/tx2gene.txt",
header=TRUE)
tx2gene <- tx2gene[,2:1]
Is that not surprisingly little?
barplot(table(table(tx2gene$GENEID)))
Select the genes with isoforms
gwi <- names(table(tx2gene$GENEID)[table(tx2gene$GENEID) > 1])
message(paste("There are",length(gwi),"genes with splice isoforms"))
## There are 5885 genes with splice isoforms
Plotting function IMPORTANT: To be plotted, genes need to have an expression > 1 in at least 3 samples. Moreover, some samples will be removed from the plot if they have no expression
plotTx <- function(tpt,tpg,dendro,tp,e.cutoff=1,s.cutoff=3){
#options(warn=2)
col.sel <- match(dendro,colnames(vst.g))
# plot all gene transcripts where the transcript is only found DE
dev.null <- sapply(tpt[tpt %in% gwi][! tpt[tpt %in% gwi] %in% tpg],function(nam){
#heatmap.2(as.matrix(rbind(vst.t[grep(nam,rownames(vst.t)),],
# vst.g[nam,])),cexRow=0.6)
#message(nam)
row.sel <- grepl(nam,rownames(vst.t))
mat <- as.matrix(rbind(
vst.g[nam,col.sel],
vst.t[row.sel,col.sel]))
de.sel <- rownames(vst.t)[row.sel] %in% tp[grep(nam,tp)]
if(any(rowSums(mat[-1,][de.sel,,drop=FALSE] > e.cutoff)>s.cutoff)){
plotmatrix(mat,
col=c(1,pal[1:sum(row.sel)]),
lty=c(2,2 - (de.sel)),
xlabels=paste(samples$Genotype,samples$Temp)[col.sel],las=2,
main=paste(nam,"transcripts expression"))
smat <- scale(mat)
# if a sample has consistently no expression
col.skip <- is.na(colSums(smat))
plotmatrix(smat[,!col.skip],
col=c(1,pal[1:sum(row.sel)]),
lty=c(2,2 - (rownames(vst.t)[row.sel] %in% tp[grep(nam,tp)])),
xlabels=paste(samples$Genotype,samples$Temp)[col.sel][!col.skip],las=2,
main=paste(nam,"z-score"))
return(1);
}
return(0);
})
return(sum(dev.null))
}
plotG <- function(tpt,tpg,dendro,tp,e.cutoff=1,s.cutoff=3){
#options(warn=2)
col.sel <- match(dendro,colnames(vst.g))
# plot all gene transcripts where the transcript is only found DE
dev.null <- sapply(tpg[tpg %in% gwi][! tpg[tpg %in% gwi] %in% tpt],function(nam){
#heatmap.2(as.matrix(rbind(vst.t[grep(nam,rownames(vst.t)),],
# vst.g[nam,])),cexRow=0.6)
#message(nam)
row.sel <- grepl(nam,rownames(vst.t))
mat <- as.matrix(rbind(
vst.g[nam,col.sel],
vst.t[row.sel,col.sel]))
de.sel <- rownames(vst.t)[row.sel] %in% tp[grep(nam,tp)]
#if(any(rowSums(mat[-1,][de.sel,,drop=FALSE] > e.cutoff)>s.cutoff)){
plotmatrix(mat,
col=c(1,pal[1:sum(row.sel)]),
lty=c(1,2 - (de.sel)),
xlabels=paste(samples$Genotype,samples$Temp)[col.sel],las=2,
main=paste(nam,"transcripts expression"))
smat <- scale(mat)
# if a sample has consistently no expression
col.skip <- is.na(colSums(smat))
plotmatrix(smat[,!col.skip],
col=c(1,pal[1:sum(row.sel)]),
lty=c(1,2 - (rownames(vst.t)[row.sel] %in% tp[grep(nam,tp)])),
xlabels=paste(samples$Genotype,samples$Temp)[col.sel][!col.skip],las=2,
main=paste(nam,"z-score"))
# return(1);
#}
#return(0);
})
#return(sum(dev.null))
}
tpt <- unique(sub("\\.[0-9]+$","",t.pcp.vs.wt))
plot.new()
grid.draw(venn.diagram(list(
gene=pcp.vs.wt[pcp.vs.wt %in% gwi],
transcript=tpt[tpt %in% gwi]),
filename=NULL,
col=pal[1:2],
category.names=c("gene","transcript")))
Order the samples as in the heatmap
dendro <- as.character(c(2,1,3,
37,39,38,
25,26,27,
13,14,15,
30,28,29,
18,17,16,
42,40,41,
5,6,4))
pdf("analysis/DESeq2/Kallisto-transcript/non-de-gene-transcript-abundance_by-genotype.pdf",height=12,width=8)
par(mfrow=c(3,2))
message(paste("There are",
plotTx(tpt,pcp.vs.wt,dendro,t.pcp.vs.wt,s.cutoff=6),
"transcripts passing the thresholds"))
## There are 227 transcripts passing the thresholds
dev.off()
## png
## 2
pdf("analysis/DESeq2/Kallisto-transcript/de-gene-transcript-abundance_by-genotype.pdf",height=12,width=8)
par(mfrow=c(3,2))
dev.null <- plotG(tpt,pcp.vs.wt,dendro,t.pcp.vs.wt)
dev.off()
## png
## 2
tpt <- unique(sub("\\.[0-9]+$","",t.warm.vs.cold))
plot.new()
grid.draw(venn.diagram(list(
gene=warm.vs.cold[warm.vs.cold %in% gwi],
transcript=tpt[tpt %in% gwi]),
filename=NULL,
col=pal[1:2],
category.names=c("gene","transcript")))
Order the samples as in the heatmap
dendro <- as.character(c(13,14,15,
25,26,27,
30,28,29,
18,17,16,
42,40,41,
5,6,4,
2,1,3,
37,39,38))
pdf("analysis/DESeq2/Kallisto-transcript/non-de-gene-transcript-abundance_by-temperature.pdf",height=12,width=8)
par(mfrow=c(3,2))
message(paste("There are",
plotTx(tpt,warm.vs.cold,dendro,t.warm.vs.cold,s.cutoff=6),
"transcripts passing the thresholds"))
## There are 144 transcripts passing the thresholds
dev.off()
## png
## 2
pdf("analysis/DESeq2/Kallisto-transcript/de-gene-transcript-abundance_by-temperature.pdf",height=12,width=8)
par(mfrow=c(3,2))
dev.null <- plotG(tpt,warm.vs.cold,dendro,t.warm.vs.cold)
dev.off()
## png
## 2
tpt <- unique(sub("\\.[0-9]+$","",t.pcp.temperature))
plot.new()
grid.draw(venn.diagram(list(
gene=pcp.temperature[pcp.temperature %in% gwi],
transcript=tpt[tpt %in% gwi]),
filename=NULL,
col=pal[1:2],
category.names=c("gene","transcript")))
Order the samples as in the heatmap
dendro <- as.character(c(15,13,14,
25,27,26,
18,17,16,
30,29,28,
3,2,1,
4,5,6,
42,41,40,
37,38,39))
pdf("analysis/DESeq2/Kallisto-transcript/non-de-gene-transcript-abundance_by-interaction.pdf",height=12,width=8)
par(mfrow=c(3,2))
message(paste("There are", plotTx(tpt,pcp.temperature,dendro,t.pcp.temperature),"transcripts passing the thresholds"))
## There are 27 transcripts passing the thresholds
dev.off()
## png
## 2
pdf("analysis/DESeq2/Kallisto-transcript/de-gene-transcript-abundance_by-interaction.pdf",height=12,width=8)
par(mfrow=c(3,2))
dev.null <- plotG(tpt,pcp.temperature,dendro,t.pcp.temperature)
dev.off()
## png
## 2
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.1 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] VennDiagram_1.6.17 futile.logger_1.4.3
## [3] RColorBrewer_1.1-2 pander_0.6.0
## [5] org.At.tair.db_3.4.0 AnnotationDbi_1.36.0
## [7] Mfuzz_2.34.0 e1071_1.6-7
## [9] LSD_3.0 Glimma_1.2.1
## [11] gplots_3.0.1 DESeq2_1.14.1
## [13] SummarizedExperiment_1.4.0 Biobase_2.34.0
## [15] GenomicRanges_1.26.1 GenomeInfoDb_1.10.1
## [17] IRanges_2.8.1 S4Vectors_0.12.1
## [19] BiocGenerics_0.20.0
##
## loaded via a namespace (and not attached):
## [1] edgeR_3.16.5 splines_3.3.2 gtools_3.5.0
## [4] Formula_1.2-1 assertthat_0.1 latticeExtra_0.6-28
## [7] yaml_2.1.14 RSQLite_1.1-1 backports_1.0.4
## [10] lattice_0.20-34 limma_3.30.7 digest_0.6.10
## [13] XVector_0.14.0 colorspace_1.3-2 htmltools_0.3.5
## [16] Matrix_1.2-7.1 plyr_1.8.4 XML_3.98-1.5
## [19] genefilter_1.56.0 zlibbioc_1.20.0 xtable_1.8-2
## [22] scales_0.4.1 gdata_2.17.0 BiocParallel_1.8.1
## [25] htmlTable_1.7 tibble_1.2 openssl_0.9.5
## [28] annotate_1.52.0 ggplot2_2.2.0 nnet_7.3-12
## [31] lazyeval_0.2.0 survival_2.40-1 magrittr_1.5
## [34] memoise_1.0.0 evaluate_0.10 foreign_0.8-67
## [37] class_7.3-14 tools_3.3.2 data.table_1.10.0
## [40] stringr_1.1.0 munsell_0.4.3 locfit_1.5-9.1
## [43] cluster_2.0.5 lambda.r_1.1.9 base64_2.0
## [46] caTools_1.17.1 RCurl_1.95-4.8 bitops_1.0-6
## [49] rmarkdown_1.2 gtable_0.2.0 DBI_0.5-1
## [52] gridExtra_2.2.1 knitr_1.15.1 Hmisc_4.0-1
## [55] rprojroot_1.1 futile.options_1.0.0 KernSmooth_2.23-15
## [58] stringi_1.1.2 Rcpp_0.12.8 geneplotter_1.52.0
## [61] rpart_4.1-10 acepack_1.4.1